Comparative functional enrichment analysis
between proteins annotated as displaying intensity variation in one of
the four ciliary locations and annotated as stable
# map_cluster_number: # - x: enrichResult or compareClusterResult (from clusterProfiler)# - df: simplifyGO result data.frame with columns ID (term IDs) and Cluster_num# - comp: if TRUE, use x@compareClusterResult, else x@result# Returns a data.frame of all enrich columns + term_size + Cluster_nummap_cluster_number <-function(x, df,comp =FALSE) {## 1. Standardize the input DF’s column namescolnames(df) <-c("ID", "Cluster_num")## 2. Select the correct slot from the clusterProfiler objectif (comp) {# from compareClusterResult results <- x@compareClusterResult } else {# from a single enrichResult results <- x@result }## 3. Get term_size (the numerator of the "BgRatio" string)## e.g. "12/21273" → 12 results$term_size <-as.numeric(sapply(strsplit(results$BgRatio, "/"),function(parts) as.numeric(parts[1])) )## 4. Merge the enrichment results with the cluster assignments merged_results <-merge( results, df,by ="ID",all.x =TRUE )## 5. Return the processed data framereturn(merged_results)}# Function to map Ensembl IDs to gene symbols for a single rowmap_geneID_to_symbol <-function(geneID_str) { geneIDs <-unlist(strsplit(geneID_str, "/")) mapped_ids <-bitr(geneIDs, fromType ="ENSEMBL", toType ="SYMBOL", OrgDb = org.Hs.eg.db)# drop duplicates in ensembl column and keep first occurence mapped_ids <- mapped_ids[!duplicated(mapped_ids$ENSEMBL),] gene_symbols <- mapped_ids$SYMBOLreturn(paste(gene_symbols, collapse ="/"))}# Function to map Ensembl IDs to gene symbols for a single rowmap_geneID_to_name <-function(geneIDs) { mapped_ids <-bitr(geneIDs, fromType ="ENSEMBL", toType =c("GENENAME", "SYMBOL"), OrgDb = org.Hs.eg.db)# drop duplicates in ensembl column and keep first occurence mapped_ids <- mapped_ids[!duplicated(mapped_ids$ENSEMBL),]return(mapped_ids)}
3 Data loading and
preprocessing
3.1 Set input and output
paths
# set input and output pathsin_path <-"/mnt/Data/Projects/Cilia/revision/NonRestricted/data/"out_path <-"/mnt/Data/Projects/Cilia/revision/NonRestricted/analysis/GO_BP/"
# Identify columns that contain "num" in their namesnum_columns <-grep("num", names(df_all), value =TRUE)# Convert all other columns from string to logicaldf_all <- df_all %>%mutate(across(-all_of(num_columns), ~as.logical(.)))# map gene IDs to gene names mapped_ids <-map_geneID_to_name(rownames(df_all))# index by Ensemblrownames(mapped_ids) <- mapped_ids$ENSEMBL# pull out exactly one entry per row of df_alldf_all$GeneSymbol <- mapped_ids[ rownames(df_all), "SYMBOL" ]df_all$GeneName <- mapped_ids[ rownames(df_all), "GENENAME" ]# add rownames as column and reset indexdf_all$Ensembl_ID <-rownames(df_all)rownames(df_all) <-NULL# reorder columns to have Ensembl_ID first, Symbol and Gene name first and then all other columnsdf_all <- df_all %>% dplyr::select(Ensembl_ID, GeneSymbol, GeneName, everything())
# combine pc and tip and tzdf_pc_tip_tz <-rbind(df_pc, df_tip, df_tz)# drop duplicates based on Ensembl_ID columndf_pc_tip_tz <- df_pc_tip_tz[!duplicated(df_pc_tip_tz$Ensembl_ID), ]gene_id_pc_tip_tz <- df_pc_tip_tz$Ensembl_ID
4 Compare biological
themes for different locations with variable proteins
4.1 Define variable
genes
# Load the data from the Excel filedf <-read_excel("/mnt/Data/Projects/Cilia/revision/Filtered_Staining_List_combined_exploded.xlsx")# Identify Ensembl IDs for each category by searching substrings in the "Annotation (Intensity)" columntip_variable <-unique(df$`Ensembl id`[grepl("Primary cilium tip", df$`Annotation (Intensity)`)])pc_variable <-unique(df$`Ensembl id`[grepl("Primary cilium", df$`Annotation (Intensity)`)])tz_variable <-unique(df$`Ensembl id`[grepl("Primary cilium transition zone", df$`Annotation (Intensity)`)])# Combine all variable genes and remove duplicatesvariable_genes <-unique(c(pc_variable, tip_variable, tz_variable))# Identify any non-variable genesnon_variable <-setdiff(gene_id_pc_tip_tz, variable_genes)
4.2 Map as columns to
df_all and save as csv file
# add "Intensity variation" columns to df_all with true if in variable genes, false if in non_variable and NA otherwisedf_all <- df_all %>%mutate(Intensity_variation =case_when( Ensembl_ID %in% variable_genes ~TRUE, Ensembl_ID %in% non_variable ~FALSE,TRUE~NA# catch-all case for any other Ensembl IDs ) )# write to filewrite.csv(df_all, file =paste0(out_path, "NonRestricted_all_proteins_enrichment_input_mapping.csv"))
4.3 GO BP enrichment
analysis
# Split data by locationinput_genes <-list(non_variable = non_variable,variable = variable_genes)# check length of each listlapply(input_genes, length)
# save dotplot as svg fileggsave(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot.svg"), plot =last_plot(), device ="svg", width =14, height =50, limitsize =FALSE)
4.3.2 Visualize overlap
of enriched terms
# extract resultsresults <- comp@compareClusterResult# split by locationvariable <- results[results$Cluster =="variable",]non_variable <- results[results$Cluster =="non_variable",]# Create a list of setsgo_lists <-list(variable = variable$ID,non_variable = non_variable$ID)# Plot the Venn diagramvenn.plot <-venn.diagram(x = go_lists,category.names =c("variable", "non_variable"),filename =NULL,output =TRUE)grid.newpage()grid.draw(venn.plot)
# Save the captured plot as an SVG filesvg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_venn.svg"), width =6, height =6)grid.draw(venn.plot)dev.off()
## svg
## 2
4.3.3 Filter terms only
enriched in one condition
# calculate intersection of the twounspecific_terms <-intersect(variable$ID, non_variable$ID)# remove all unspecific termsspecific_terms <- results %>%filter(!ID %in% unspecific_terms)# create a copy of compcomp_filtered <- comp# update results in compcomp_filtered@compareClusterResult <- specific_terms
# save dotplot as svg fileggsave(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only.svg"), plot =last_plot(), device ="svg", width =14, height =50, limitsize =FALSE)
# Subset for pc_tipvariable <- comp_filtered@compareClusterResult[ comp_filtered@compareClusterResult$Cluster =="variable", ]# Subset for bb_tznon_variable <- comp_filtered@compareClusterResult[ comp_filtered@compareClusterResult$Cluster =="non_variable", ]# Create new compareClusterResult objects for each subsetcomp_filtered_variable <- comp_filteredcomp_filtered_non_variable <- comp_filteredcomp_filtered_variable@compareClusterResult <- variablecomp_filtered_non_variable@compareClusterResult <- non_variable
4.3.5 Cluster results -
variable proteins
go_id = comp_filtered_variable@compareClusterResult$IDmat =GO_similarity(go_id, ont ='BP', db ='org.Hs.eg.db', measure ="Sim_Relevance_2006" )
4.3.5.1 Plot cluster
heatmap
# Capture the plotheatmap_plot <-grid.grabExpr({ df <-simplifyGO(mat,method ='binary_cut',plot =TRUE,column_title ="GO BP terms only significant in variable proteins",use_raster =FALSE,order_by_size =TRUE,fontsize_range =c(9, 18),max_words =6,word_cloud_grob_param =list(col ='black', max_width =unit(200, "mm")))})# Save the captured plot as an SVG filesvg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_variable_heatmap.svg"), width =16, height =9)grid.draw(heatmap_plot)dev.off()
## pdf
## 2
grid.newpage()grid.draw(heatmap_plot)
4.3.5.2 Process and save
results
# add cluster number from GO term clusteringresults <-map_cluster_number(comp_filtered_variable,df = df,comp =TRUE)# Apply the function to each row of the DataFrameresults$GeneSymbol <-sapply(results$geneID, map_geneID_to_symbol)# save results as csv filewrite.csv(results, file =paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_variable_result.csv"))
4.3.6 Cluster results -
non-variable proteins
go_id = comp_filtered_non_variable@compareClusterResult$IDmat =GO_similarity(go_id, ont ='BP', db ='org.Hs.eg.db', measure ="Sim_Relevance_2006" )
4.3.6.1 Plot cluster
heatmap
# Capture the plotheatmap_plot <-grid.grabExpr({ df <-simplifyGO(mat,method ='binary_cut',plot =TRUE,column_title ="GO BP terms only significant in non-variable proteins",use_raster =FALSE,order_by_size =TRUE,fontsize_range =c(18, 36),max_words =6,word_cloud_grob_param =list(col ='black', max_width =unit(200, "mm")))})# Save the captured plot as an SVG filesvg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_nonvariable_heatmap.svg"), width =16, height =9)grid.draw(heatmap_plot)dev.off()
## pdf
## 2
grid.newpage()grid.draw(heatmap_plot)
4.3.6.2 Process and save
results
# add cluster number from GO term clusteringresults <-map_cluster_number(comp_filtered_non_variable,df = df,comp =TRUE)# Apply the function to each row of the DataFrameresults$GeneSymbol <-sapply(results$geneID, map_geneID_to_symbol)# save results as csv filewrite.csv(results, file =paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_specific_terms_only_nonvariable_result.csv"))
4.3.7 Filter for terms
enriched in both variable and non-variable
# get results as data frame# extract resultsresults <- comp@compareClusterResult# split by locationvariable <- results[results$Cluster =="variable",]non_variable <- results[results$Cluster =="non_variable",]# calculate intersection of the twounspecific_terms <-intersect(variable$ID, non_variable$ID)# remove all unspecific termsspecific_terms <- results %>%filter(!ID %in% unspecific_terms)unspecific_terms <- results %>%filter(ID %in% unspecific_terms)# create a copy of compcomp_filtered <- comp# update results in compcomp_filtered@compareClusterResult <- unspecific_terms
# save dotplot as svg fileggsave(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_shared_terms_only.svg"), plot =last_plot(), device ="svg", width =14, height =10, limitsize =FALSE)
4.3.9 Cluster results -
Both locations
# Split by locationvariable <- comp_filtered@compareClusterResult[comp_filtered@compareClusterResult$Cluster =="variable", ]non_variable <- comp_filtered@compareClusterResult[comp_filtered@compareClusterResult$Cluster =="non_variable", ]# Create new compareClusterResult objects for each subsetcomp_filtered_variable<- comp_filteredcomp_filtered_non_variable <- comp_filteredcomp_filtered_variable@compareClusterResult <- variablecomp_filtered_non_variable@compareClusterResult <- non_variable
4.3.9.1 Plot cluster
heatmap
go_id = comp_filtered_variable@compareClusterResult$IDmat =GO_similarity(go_id, ont ='BP', db ='org.Hs.eg.db', measure ="Sim_Relevance_2006" )
# Capture the plotheatmap_plot <-grid.grabExpr({ df <-simplifyGO(mat,method ='binary_cut',plot =TRUE,column_title ="GO BP terms significant in both locations",use_raster =FALSE,order_by_size =TRUE,fontsize_range =c(18, 36),max_words =6,word_cloud_grob_param =list(col ='black', max_width =unit(200, "mm")))})# Save the captured plot as an SVG filesvg(paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_shared_terms_heatmap.svg"), width =16, height =9)grid.draw(heatmap_plot)dev.off()
## pdf
## 2
grid.newpage()grid.draw(heatmap_plot)
4.3.9.2 Process and save
results
# add cluster number from GO term clusteringresults <-map_cluster_number(comp_filtered_variable,df = df,comp =TRUE)# Apply the function to each row of the DataFrameresults$GeneSymbol <-sapply(results$geneID, map_geneID_to_symbol)# save results as csv filewrite.csv(results, file =paste0(out_path, "NonRestricted_comparison_variable_GO_BP_dotplot_shared_terms_result_variable.csv"))